{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualizing Word Embeddings on the Tensorboard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import os\n",
    "import zipfile\n",
    "from tensorflow.contrib.tensorboard.plugins import projector\n",
    "import csv\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read the GloVe file\n",
    "\n",
    "Here we first need to download the GloVe word embeddings (`glove.6B.zip`) found at this [website](https://nlp.stanford.edu/projects/glove/). Then we read the GloVe file to get the first 50000 words in the file. We will be using 50 dimensional word vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ".....\tDone\n"
     ]
    }
   ],
   "source": [
    "vocabulary_size = 50000\n",
    "\n",
    "pret_embeddings = np.empty(shape=(vocabulary_size,50),dtype=np.float32)\n",
    "\n",
    "words = [] \n",
    "\n",
    "word_idx = 0\n",
    "# Open the zip file\n",
    "with zipfile.ZipFile('glove.6B.zip') as glovezip:\n",
    "    # Read the file with 50 dimensional embeddings\n",
    "    with glovezip.open('glove.6B.50d.txt') as glovefile:\n",
    "        # Read line by line\n",
    "        for li, line in enumerate(glovefile):\n",
    "            # Print progress\n",
    "            if (li+1)%10000==0: print('.',end='')\n",
    "                \n",
    "            # Get the word and the corresponding vector\n",
    "            line_tokens = line.decode('utf-8').split(' ')\n",
    "            word = line_tokens[0]\n",
    "            vector = [float(v) for v in line_tokens[1:]]\n",
    "            \n",
    "            assert len(vector)==50\n",
    "            words.append(word)\n",
    "            # Update the embedding matrix\n",
    "            pret_embeddings[word_idx,:] = np.array(vector)\n",
    "            word_idx += 1\n",
    "            # If the first 50000 words being read, finish\n",
    "            if word_idx == vocabulary_size:\n",
    "                break\n",
    "                \n",
    "print('\\tDone')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create TensorFlow Variable\n",
    "\n",
    "Here we create a TensorFlow variable to store the embeddings we read above and save it to the disk. This is necessary for the visualization."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a directory to save our model\n",
    "log_dir = 'models'\n",
    "if not os.path.exists(log_dir):\n",
    "    os.mkdir(log_dir)\n",
    "\n",
    "tf.reset_default_graph()\n",
    "\n",
    "# Create a Tensorflow variable initialized with the word embedings we just read in\n",
    "embeddings = tf.get_variable('embeddings',shape=[vocabulary_size, 50],\n",
    "                             initializer=tf.constant_initializer(pret_embeddings))\n",
    "\n",
    "session = tf.InteractiveSession()\n",
    "\n",
    "tf.global_variables_initializer().run()\n",
    "\n",
    "# Define a saver, that will save the Tensorflow variables to a given location\n",
    "saver = tf.train.Saver({'embeddings':embeddings})\n",
    "# Save the file\n",
    "saver.save(session, os.path.join(log_dir, \"model.ckpt\"), 0)\n",
    "\n",
    "# Define metadata for word embeddings\n",
    "with open(os.path.join(log_dir,'metadata.tsv'), 'w',encoding='utf-8') as csvfile:\n",
    "    writer = csv.writer(csvfile, delimiter='\\t',\n",
    "                            quotechar='|', quoting=csv.QUOTE_MINIMAL)\n",
    "    writer.writerow(['Word','Word ID'])\n",
    "    for wi,w in enumerate(words):\n",
    "      writer.writerow([w,wi])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the configuration to tell the Tensorboard where and what to look"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "config = projector.ProjectorConfig()\n",
    "\n",
    "# You can add multiple embeddings. Here we add only one.\n",
    "embedding_config = config.embeddings.add()\n",
    "embedding_config.tensor_name = embeddings.name\n",
    "# Link this tensor to its metadata file (e.g. labels).\n",
    "embedding_config.metadata_path = 'metadata.tsv'\n",
    "\n",
    "# Use the same LOG_DIR where you stored your checkpoint.\n",
    "summary_writer = tf.summary.FileWriter(log_dir)\n",
    "\n",
    "# The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will\n",
    "# read this file during startup.\n",
    "projector.visualize_embeddings(summary_writer, config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
